#!/usr/bin/perl

# index-doaj.pl - get doaj content and index it

# Eric Lease Morgan <eric_morgan@infomotions.com>
# January  12, 2009 - version 1.0


# pragmas; enabled first so they also cover the constant definitions below
use strict;
use warnings;

# define
use constant OAIURL => 'http://www.doaj.org/oai';    # base URL handed to the OAI harvester
use constant PREFIX => 'oai_dc';                     # metadata prefix requested when listing records
use constant SOLR   => 'http://localhost:210/solr';  # URL handed to the Solr client

# require
use Net::OAI::Harvester;
use WebService::Solr;

# initialize oai and solr
my $harvester = Net::OAI::Harvester->new( baseURL => OAIURL );
my $solr      = WebService::Solr->new( SOLR );

# get all records and loop through them; each record is echoed to STDOUT
# and then added to the index as one solr document
my $records = $harvester->listAllRecords( metadataPrefix => PREFIX );
my $id      = 0;
while ( my $record = $records->next ) {

	# increment
	$id++;
	#last if ( $id > 100 );  # uncomment to stop after 100 records while testing
	
	# extract the desired metadata; called in scalar context the accessors
	# are expected to return only the first value of a repeatable element
	# (see the Net::OAI::Record::OAI_DC documentation)
	my $metadata        = $record->metadata;
	my $identifier      = $record->header->identifier;
	my $first_title     = $metadata->title;
	my $first_url       = $metadata->identifier;
	my $first_publisher = $metadata->publisher;
	
	# test for definedness, not truth, so a literal "0" is kept
	my $title     = defined $first_title     ? strip( $first_title )     : '';
	my $url       = defined $first_url       ? $first_url                : '';
	my $publisher = defined $first_publisher ? strip( $first_publisher ) : '';
	
	# subjects are repeatable; the second call is in list context
	my @all_subjects = $metadata->subject ? $metadata->subject : ();
	
	# normalize subjects
	my @subjects = ();
	foreach my $subject ( @all_subjects ) {
	
		$subject =~ s/DoajSubjectTerm: //;  # remove DOAJ label
		next if ( $subject =~ /LCC: / );    # don't want call numbers
		push @subjects, $subject;
		
	}
	
	# echo
	print "      record: $id\n";
	print "  identifier: $identifier\n";
	print "       title: $title\n";
	print "   publisher: $publisher\n";
	foreach my $subject ( @subjects ) { print "     subject: $subject\n" }
	print "         url: $url\n";
	print "\n";
		
	# create solr/lucene document and fill it up
	my $doc = WebService::Solr::Document->new;
	$doc->add_fields(
		WebService::Solr::Field->new( id        => $identifier ),
		WebService::Solr::Field->new( title     => $title ),
		WebService::Solr::Field->new( publisher => $publisher ),
		WebService::Solr::Field->new( url       => $url ),
	);
	foreach my $subject ( @subjects ) {
	
		# the same cleaned value goes into both the subject field and
		# the facet_subject field, so clean it only once
		my $clean_subject = strip( $subject );
		$doc->add_fields(
			WebService::Solr::Field->new( subject       => $clean_subject ),
			WebService::Solr::Field->new( facet_subject => $clean_subject ),
		);
		
	}

	# save
	$solr->add( $doc );
	
}

# done
exit;


sub strip {

	# Return a copy of the first argument with every non-ascii character
	# removed; an undefined argument yields an empty string.
	#
	# This is a lossy stop-gap (bogus!), not transliteration: an accented
	# letter is dropped outright rather than mapped to its plain form.
	# see: http://www.perlmonks.org/?node_id=613773
	my $s = shift;
	return '' unless defined $s;  # nothing to strip; also avoids an undef warning
	$s =~ s/[^[:ascii:]]+//g;
	return $s;
	
}
